# Install required libraries
packages <- c('leaflet','dplyr','data.table','sp', 'rgeos', 'raster',
'rgdal','GISTools','magrittr','BSDA', 'PASWR','broom','tidyverse','gtools')
for(p in packages){
if(!require(p,character.only = T)){
install.packages(p)
}
library(p,character.only = T)
}
'%!in%' <- function(x,y)!('%in%'(x,y))
# Install required libraries
data <- read.csv("data/realis2018.csv")
One day, your mum tells you that we just won the first prize of TOTO which worth 2,500,000 SGD. After you discuss with your family, you decide to buy a flat and plan for the investment. During the period of time, you contact the company “Property Master@” to discuss more on the historical transaction of Singapore property. The sample data givenis “realis2018.csv”. For problems stated below, we use α= 5%.
# filter no. of units = 1 cos some transactions are the whole damn building
data %<>% filter(No..of.Units==1)
# recode YISHUN and Yishun
data$Planning.Area <- recode(data$Planning.Area,"YISHUN"="Yishun")
You look around few different planning areas (Column R). When you ask the agent what is the mean Unit price(psm) for Newton flats (Column G), she claims that the mean is higher than 26500. Do you agree with your agent’s suggestion? Explain and justify your answer.
Newton <- data %>% filter(Planning.Area=="Newton",Property.Type %in% c("Condominium", "Apartment", "Executive Condominium"), No..of.Units == "1")
z.test(Newton$Unit.Price....psm, alternative = "greater", mu= 26500,sigma.x=sd(Newton$Unit.Price....psm))
One-sample z-Test
data: Newton$Unit.Price....psm
z = 1.881, p-value = 0.02998
alternative hypothesis: true mean is greater than 26500
95 percent confidence interval:
26598.75 Inf
sample estimates:
mean of x
27286.51
At 95% Confidence level, we have sufficient evidence to say that the mean price per square meter(psm) of flats in the Newton Area is more than $26500.
Your friend told you that Newton planning area may not be the best area to choose. He suggested you to consider other planning areas. This is a very difficult decision since you need to conduct a more comprehensive analysis and you also need to justify whether you still choose Newton or another planning area.
realis <- fread('data/realis2018.csv')
realis$pa <- toupper(realis$`Planning Area`)
centroids <- readOGR('data/DGP Centroids.shp')
OGR data source with driver: ESRI Shapefile
Source: "/Users/ellpeeaxe/Desktop/SMU/Y4S2/ASSR/Project/TakeHome/data/DGP Centroids.shp", layer: "DGP Centroids"
with 52 features
It has 3 fields
centroids <- spTransform(centroids, CRS("+proj=moll +ellps=WGS84"))
dgp <- readOGR("data/MP14_PLNG_AREA_WEB_PL.shp")
OGR data source with driver: ESRI Shapefile
Source: "/Users/ellpeeaxe/Desktop/SMU/Y4S2/ASSR/Project/TakeHome/data/MP14_PLNG_AREA_WEB_PL.shp", layer: "MP14_PLNG_AREA_WEB_PL"
with 55 features
It has 12 fields
dgp <- spTransform(dgp, CRS("+proj=longlat +ellps=GRS80"))
one_unit <- subset(realis, realis$`No. of Units` == 1 & realis$`Transacted Price ($)` <= 2500000)
pa_units <- aggregate(realis$`No. of Units`,
by = list(realis$pa),
FUN = sum)
colnames(pa_units) = c('PA', 'Units')
m <- merge(dgp,pa_units, by.x ='PLN_AREA_N', by.y = 'PA')
pal <-
colorBin(palette = brewer.pal(10,"YlGnBu"),
domain = c(0,2000),
na.color = "#00000000",
bins=c(0,5,10,50,100,200,400,600,800,1000,1200,1400,1600,1800,2000))
n too large, allowed maximum for palette YlGnBu is 9
Returning the palette you asked for with that many colors
leaflet(dgp) %>% addTiles() %>% addPolygons(fillColor = ~pal(m$Units), weight = 2,
opacity = 1, color = "grey",
dashArray = "1", fillOpacity = 0.8) %>%
addLegend("topright", pal, values=(0:2000), title = "Transacted Units", labFormat = labelFormat(suffix = " Units", between = '-')) %>%
addLabelOnlyMarkers(data = centroids, lng = ~cen_x_dgp, lat = ~cen_y_dgp, label = ~DGP,
labelOptions = labelOptions(noHide = TRUE, direction = 'center', textOnly = TRUE))
pa_units[order(-pa_units$Units),]
stock_data <- fread("data/stock2019Q4.csv")
stock_data$PA <- toupper(stock_data$PA)
stock <- merge(dgp,stock_data, by.x ='PLN_AREA_N', by.y = 'PA')
pal <-
colorBin(palette = brewer.pal(10,"YlGnBu"),
domain = c(0,2000),
na.color = "#00000000",
bins=c(0,500,1000,3000,5000,8000,10000,15000,20000,30000))
n too large, allowed maximum for palette YlGnBu is 9
Returning the palette you asked for with that many colors
leaflet(dgp) %>% addTiles() %>%
addPolygons(fillColor = ~pal(stock$Total), weight = 2,
opacity = 1, color = "grey",
dashArray = "1", fillOpacity = 0.8) %>%
addLegend("topright", pal, values=(0:2000),
title = "Total Stock",
labFormat = labelFormat(suffix = " Units", between = '-')) %>%
addLabelOnlyMarkers(data = centroids, lng = ~cen_x_dgp, lat = ~cen_y_dgp, label = ~DGP,
labelOptions = labelOptions(noHide = TRUE, direction = 'center', textOnly = TRUE))
stock_data[order(-stock_data$Total),]
PropType <- unique(data$Property.Type)
for (k in 1:length(unique(data$Property.Type))){
data_property <- data %>% filter(Property.Type==unique(data$Property.Type)[k])
res.aov <- aov(Unit.Price....psm.~Planning.Area,data=data_property)
summary(res.aov)
results <- tidy(TukeyHSD(res.aov,ordered=TRUE))
results_sorted <- results %>% separate(comparison, c("Bigger", "Smaller"),sep = "-")
rankings1 <- results_sorted %>% group_by(Bigger) %>% summarise(Count=n()) %>% arrange(desc(Count))
rankings2 <- results_sorted %>% filter(Smaller %!in% rankings1$Bigger) %>% group_by(Smaller) %>% summarise(Count=n())%>% arrange(Count)
names(rankings2)[1] <- "Bigger"
rankings <- rbind(rankings1,rankings2)
rankings$Rank <- seq.int(nrow(rankings))
rankings %<>% dplyr::select(-Count)
results_sorted <- left_join(results_sorted, rankings) %>% arrange(Rank)
rankings$TukeyRank <- NA
rankings$TukeyRank[1] <- 1
for( i in 2:nrow(rankings)){
if(results_sorted$adj.p.value[results_sorted$Bigger==rankings$Bigger[i-1]&results_sorted$Smaller == rankings$Bigger[i]]<=0.05){
rankings$TukeyRank[i] <- rankings$TukeyRank[i-1]+1
} else if(sum((results_sorted$Smaller[results_sorted$Bigger==rankings$Bigger[i-1]&results_sorted$adj.p.value<=0.05] %in% results_sorted$Smaller[results_sorted$Bigger==rankings$Bigger[i]&results_sorted$adj.p.value>0.05])>0)){
rankings$TukeyRank[i] <- rankings$TukeyRank[i-1]+1
} else {
rankings$TukeyRank[i] <- rankings$TukeyRank[i-1]
}
}
Tukey_ranked <- rankings %>% dplyr::select(Bigger,TukeyRank)
names(Tukey_ranked)[1] <- "Planning.Area"
Planning_Mean <- data_property %>% group_by(Planning.Area) %>% summarise(mean= mean(Unit.Price....psm.), sd= sd(Unit.Price....psm.))
Tukey_ranked <- left_join(Tukey_ranked, Planning_Mean)
assign(paste("aov_", PropType[k], sep = ""), tidy(res.aov))
assign(paste("Tukey_", PropType[k], sep = ""), as.data.frame(Tukey_ranked))
}
Joining, by = "Bigger"Joining, by = "Planning.Area"Column `Planning.Area` joining character vector and factor, coercing into character vectorJoining, by = "Bigger"Joining, by = "Planning.Area"Column `Planning.Area` joining character vector and factor, coercing into character vectorJoining, by = "Bigger"Joining, by = "Planning.Area"Column `Planning.Area` joining character vector and factor, coercing into character vectorJoining, by = "Bigger"Joining, by = "Planning.Area"Column `Planning.Area` joining character vector and factor, coercing into character vectorJoining, by = "Bigger"Joining, by = "Planning.Area"Column `Planning.Area` joining character vector and factor, coercing into character vectorJoining, by = "Bigger"Joining, by = "Planning.Area"Column `Planning.Area` joining character vector and factor, coercing into character vector
summary(res.aov)
Df Sum Sq Mean Sq F value Pr(>F)
Planning.Area 14 2.071e+09 147906034 216.6 <2e-16 ***
Residuals 1731 1.182e+09 682842
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Tukey_Apartment
Tukey_Condominium
`Tukey_Executive Condominium`
We performed the ANOVA test to determine if all the mean PSM per planning areas were similar. At 95% Confidence level, as P value is less than 0.05, we have sufficient evidence to say to reject H0 and that the mean price per square meter(psm) of flats are significantly different. To compare the group means, a post hoc test - TUKEY test - was performed per property type. Given the extensive results output, the TUKEY results were sorted for clarity. For each property type, the differences were sorted, filtered for results with adjusted p-value <=0.05 and then ranked accordingly.
At 95% confidence intervals:
for (k in 1:length(unique(data$Property.Type))){
data_property <- data %>% filter(Property.Type==unique(data$Property.Type)[k])
res.aov <- aov(Transacted.Price....~Planning.Area,data=data_property)
summary(res.aov)
results <- tidy(TukeyHSD(res.aov,ordered=TRUE))
results_sorted <- results %>% separate(comparison, c("Bigger", "Smaller"),sep = "-")
rankings1 <- results_sorted %>% group_by(Bigger) %>% summarise(Count=n()) %>% arrange(desc(Count))
rankings2 <- results_sorted %>% filter(Smaller %!in% rankings1$Bigger) %>% group_by(Smaller) %>% summarise(Count=n())%>% arrange(Count)
names(rankings2)[1] <- "Bigger"
rankings <- rbind(rankings1,rankings2)
rankings$Rank <- seq.int(nrow(rankings))
rankings %<>% dplyr::select(-Count)
results_sorted <- left_join(results_sorted, rankings) %>% arrange(Rank)
rankings$TukeyRank <- NA
rankings$TukeyRank[1] <- 1
for( i in 2:nrow(rankings)){
if(results_sorted$adj.p.value[results_sorted$Bigger==rankings$Bigger[i-1]&results_sorted$Smaller == rankings$Bigger[i]]<=0.05){
rankings$TukeyRank[i] <- rankings$TukeyRank[i-1]+1
} else if(sum((results_sorted$Smaller[results_sorted$Bigger==rankings$Bigger[i-1]&results_sorted$adj.p.value<=0.05] %in% results_sorted$Smaller[results_sorted$Bigger==rankings$Bigger[i]&results_sorted$adj.p.value>0.05])>0)){
rankings$TukeyRank[i] <- rankings$TukeyRank[i-1]+1
} else {
rankings$TukeyRank[i] <- rankings$TukeyRank[i-1]
}
}
Tukey_ranked <- rankings %>% dplyr::select(Bigger,TukeyRank)
names(Tukey_ranked)[1] <- "Planning.Area"
Planning_Mean <- data_property %>% group_by(Planning.Area) %>% summarise(mean= mean(Transacted.Price....), sd= sd(Transacted.Price....))
Tukey_ranked <- left_join(Tukey_ranked, Planning_Mean)
assign(paste("aov_", PropType[k], sep = ""), tidy(res.aov))
assign(paste("Tukey_", PropType[k], sep = ""), as.data.frame(Tukey_ranked))
}
Joining, by = "Bigger"Joining, by = "Planning.Area"Column `Planning.Area` joining character vector and factor, coercing into character vectorJoining, by = "Bigger"Joining, by = "Planning.Area"Column `Planning.Area` joining character vector and factor, coercing into character vectorJoining, by = "Bigger"Joining, by = "Planning.Area"Column `Planning.Area` joining character vector and factor, coercing into character vectorJoining, by = "Bigger"Joining, by = "Planning.Area"Column `Planning.Area` joining character vector and factor, coercing into character vectorJoining, by = "Bigger"Joining, by = "Planning.Area"Column `Planning.Area` joining character vector and factor, coercing into character vectorJoining, by = "Bigger"Joining, by = "Planning.Area"Column `Planning.Area` joining character vector and factor, coercing into character vector
summary(res.aov)
Df Sum Sq Mean Sq F value Pr(>F)
Planning.Area 14 1.689e+13 1.206e+12 65.86 <2e-16 ***
Residuals 1731 3.170e+13 1.831e+10
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Tukey_Apartment
Tukey_Condominium
`Tukey_Executive Condominium`
We performed the ANOVA test to determine if the mean transacted price per planning areas were similar. At 95% Confidence level, as P value is less than 0.05, we have sufficient evidence to say to reject H0 and that the mean transacted price of flats are significantly different. To compare the group means, a post hoc test - TUKEY test - was performed for each property type. Given the extensive results output, the TUKEY results were sorted for clarity. For each property type, the differences were sorted, filtered for results with adjusted p-value <=0.05 and then ranked accordingly.
At 95% confidence intervals:
mrt <- fread("data/Planning_area_mrt_stations.csv")
mrt$`Planning Area` <- toupper(mrt$`Planning Area`)
centroids <- readOGR('data/DGP Centroids.shp')
OGR data source with driver: ESRI Shapefile
Source: "/Users/ellpeeaxe/Desktop/SMU/Y4S2/ASSR/Project/TakeHome/data/DGP Centroids.shp", layer: "DGP Centroids"
with 52 features
It has 3 fields
centroids <- spTransform(centroids, CRS("+proj=moll +ellps=WGS84"))
dgp <- readOGR("data/MP14_PLNG_AREA_WEB_PL.shp")
OGR data source with driver: ESRI Shapefile
Source: "/Users/ellpeeaxe/Desktop/SMU/Y4S2/ASSR/Project/TakeHome/data/MP14_PLNG_AREA_WEB_PL.shp", layer: "MP14_PLNG_AREA_WEB_PL"
with 55 features
It has 12 fields
dgp <- spTransform(dgp, CRS("+proj=longlat +ellps=GRS80"))
mrt2 <- merge(dgp,mrt, by.x ='PLN_AREA_N', by.y = 'Planning Area')
#All MRT Stations
mrt_pal <- colorFactor(palette= brewer.pal(15, 'RdYlGn'),
domain=c(0,1,2,3,5,6,7,8,9,10,12,14),
na.color = "#00000000")
n too large, allowed maximum for palette RdYlGn is 11
Returning the palette you asked for with that many colors
leaflet(dgp) %>% addTiles() %>% addPolygons(fillColor = ~mrt_pal(mrt2$`Total no. of stations`), weight = 2,
opacity = 1, color = "grey",
dashArray = "1", fillOpacity = 0.8) %>%
addLegend("topright", mrt_pal, values=(0:14), title= "MRT Stations",
labFormat = labelFormat(suffix = " stations")) %>%
addLabelOnlyMarkers(data = centroids, lng = ~cen_x_dgp, lat = ~cen_y_dgp, label = ~DGP,
labelOptions = labelOptions(noHide = TRUE, direction = 'center', textOnly = TRUE))
Some values were outside the color scale and will be treated as NASome values were outside the color scale and will be treated as NA
#Only current MRT stations
mrt_pal2 <- colorFactor(palette= brewer.pal(12, 'RdYlGn'),
domain=c(0,1,2,3,4,5,6,7,8,9,10,11),
na.color = "#00000000")
n too large, allowed maximum for palette RdYlGn is 11
Returning the palette you asked for with that many colors
leaflet(dgp) %>% addTiles() %>% addPolygons(fillColor = ~mrt_pal2(mrt2$`No. of operational stations`), weight = 2,
opacity = 1, color = "grey",
dashArray = "1", fillOpacity = 0.8) %>%
addLegend("topright", mrt_pal2, values=(0:11), title= "MRT Stations",
labFormat = labelFormat(suffix = " stations")) %>%
addLabelOnlyMarkers(data = centroids, lng = ~cen_x_dgp, lat = ~cen_y_dgp, label = ~DGP,
labelOptions = labelOptions(noHide = TRUE, direction = 'center', textOnly = TRUE))
mrt[order(-mrt$`No. of operational stations`),]
mrt[order(-mrt$`Total no. of stations`),]
The optimal location to purchase will depend on what we want from the property. Assuming that we are looking to purchase an upscale Flat for investment purposes, the Newton area could be an option but the mean prices are high and mostly beyond our budget. There is also only 1 MRT station and limited supply of properties on sale. A better alternative would be the River Valley planning area as it has one of the highest number of available “Apartments” within our budget of $2.5 million. However, accessibility is lower than that of the other planning areas as there are currently only 1 MRT stations with 2 stations planned. Despite this, it should still be relatively accessible as it is very close to the CBD.
If we are looking to buy a flat to live in, we will be more concerned about the accessibility as well as value for money. In this regard, we would prefer planning areas like Bedok or Toa Payoh as they are mature estate with relatively high levels of accessibility and amenities. Toa Payoh has an integrated transport hub while Bedok has many upcoming future MRT stations bringing the total to 11 stations. In addition the apartments in these areas are more affordable and we could even purchase 2 if we wanted to.
Alternatively, if we are simply looking for a larger space at the cheapest possible rates and save the rest of the money for other uses, the best option would be Jurong East or Jurong West as they have one of the lowest psm. In addition, it is also a mature estate with multiple malls and decent accessibility. There will be a total of 15 MRT stations in the area in the near future.